Introduction

In this project, we will do a study on data analysis and applications on 30 different stocks in Borsa Istanbul, their prices and corresponding time periods. Additionally, starting from December 24th, we will make a 10-hour stock price prediction for 30 stocks (300 in total) every day. We will analyze these predictions with different Data Mining/Machine Learning algorithms and examine which one might be more efficient to use in this report. Since there are not much raw columns in data, we need to somehow obtain an extra information from existing features or from another sources (such as Yahoo Finance using Quantmod library).

The data includes hourly average prices of selected stocks from Borsa İstanbul, presented in separate csv files for each time interval, encompassing about 4 years of data in long format. We will going to find ways to handle this long format since we do not want other stocks affects a specific stock’s price in negative way.

Let’s start with initialization.

Loading required libraries & Data.

At first, We loaded the data to the R Environment to complete the parts of the question. Also, loaded some libraries providing the visualizations to interpret the data.

library(data.table)
library(quantmod) # for Yahoo Finance library
library(ggplot2)
library(dplyr)
library(lubridate)
library(plotly)
library(rpart.plot)
library(skimr)
library(GGally)
library(caret) # for machine learning and predictive modeling
library(rpart) # for rpart
library(gbm) # for Boosted Decision Trees
library(randomForest) # for random forests
library(reshape2) # for graphs
library(FNN)
library(class)
require(rattle)
library(pROC)
library(pdp)
library(MLmetrics)
library(magick)
library(forecast)
library(cluster)
library(plotly)
library(factoextra)
library(Metrics)

Note that the path is set to the folder in my computer. Please change it to run the commands properly.

As can be seen that there are 24 different csv files including historical stock prices of 30 stocks from 2018 to 2023. With the script below, they will be merged to a single data frame.

path <- "C:/Users/anil.turgut/Desktop/582Project/Data"

# Get a list of CSV files in the directory
files <- list.files(path, pattern = "\\.csv", full.names = TRUE)
files
##  [1] "C:/Users/anil.turgut/Desktop/582Project/Data/20180101_20180401_bist30.csv"
##  [2] "C:/Users/anil.turgut/Desktop/582Project/Data/20180402_20180701_bist30.csv"
##  [3] "C:/Users/anil.turgut/Desktop/582Project/Data/20180702_20180930_bist30.csv"
##  [4] "C:/Users/anil.turgut/Desktop/582Project/Data/20181001_20181230_bist30.csv"
##  [5] "C:/Users/anil.turgut/Desktop/582Project/Data/20181231_20190331_bist30.csv"
##  [6] "C:/Users/anil.turgut/Desktop/582Project/Data/20190401_20190630_bist30.csv"
##  [7] "C:/Users/anil.turgut/Desktop/582Project/Data/20190701_20190929_bist30.csv"
##  [8] "C:/Users/anil.turgut/Desktop/582Project/Data/20190930_20191229_bist30.csv"
##  [9] "C:/Users/anil.turgut/Desktop/582Project/Data/20191230_20200329_bist30.csv"
## [10] "C:/Users/anil.turgut/Desktop/582Project/Data/20200330_20200628_bist30.csv"
## [11] "C:/Users/anil.turgut/Desktop/582Project/Data/20200629_20200927_bist30.csv"
## [12] "C:/Users/anil.turgut/Desktop/582Project/Data/20200928_20201227_bist30.csv"
## [13] "C:/Users/anil.turgut/Desktop/582Project/Data/20201228_20210328_bist30.csv"
## [14] "C:/Users/anil.turgut/Desktop/582Project/Data/20210329_20210627_bist30.csv"
## [15] "C:/Users/anil.turgut/Desktop/582Project/Data/20210628_20210926_bist30.csv"
## [16] "C:/Users/anil.turgut/Desktop/582Project/Data/20210927_20211226_bist30.csv"
## [17] "C:/Users/anil.turgut/Desktop/582Project/Data/20211227_20220327_bist30.csv"
## [18] "C:/Users/anil.turgut/Desktop/582Project/Data/20220328_20220626_bist30.csv"
## [19] "C:/Users/anil.turgut/Desktop/582Project/Data/20220627_20220925_bist30.csv"
## [20] "C:/Users/anil.turgut/Desktop/582Project/Data/20220926_20221225_bist30.csv"
## [21] "C:/Users/anil.turgut/Desktop/582Project/Data/20221226_20230326_bist30.csv"
## [22] "C:/Users/anil.turgut/Desktop/582Project/Data/20230327_20230625_bist30.csv"
## [23] "C:/Users/anil.turgut/Desktop/582Project/Data/20230626_20230924_bist30.csv"
## [24] "C:/Users/anil.turgut/Desktop/582Project/Data/20230925_20231224_bist30.csv"
# Create an empty data frame to store the merged data
data <- data.frame()
# Loop through each file and merge it into the 'data' data frame
for (file in files) {
  # Read the CSV file
  current_data <- read.csv(file, header = TRUE)
  
  # Merge the current data with the existing merged data
  data <- bind_rows(data, current_data)
  
}

head(data)
##                   timestamp price short_name
## 1 2018-01-02 09:00:00+03:00 15.79      THYAO
## 2 2018-01-02 10:00:00+03:00 16.01      THYAO
## 3 2018-01-02 11:00:00+03:00 16.05      THYAO
## 4 2018-01-02 12:00:00+03:00 16.05      THYAO
## 5 2018-01-02 13:00:00+03:00 16.06      THYAO
## 6 2018-01-02 14:00:00+03:00 16.05      THYAO
dim(data)
## [1] 439132      3

We have successfully loaded data as a single dataframe, now move on with the Feature Engineering part.

Feature Engineering & Yahoo Finance Library

We have three features in our dataset which are timestamp, short_name, price. Since these information are not sufficient to create strong learning models, we need to obtain new features using existing ones or from another source.

Also, there are some data modifications that will helps us in further analysis or modeling in below script.

# Changing timestamp's type to POSIX to use as time series feature
data$timestamp <- as.POSIXct(data$timestamp, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
class(data$timestamp)
## [1] "POSIXct" "POSIXt"
# Create a new feature which will be used as a key while merging further information
data$time <- as.Date(format(data$timestamp, "%Y/%m/%d"))


# Extract year, month, day, and hour
data$year <- year(data$timestamp)
data$month <- month(data$timestamp)
data$day <- day(data$timestamp)
data$hour <- hour(data$timestamp)

In below we add a new feature called “forecast_ma_6” which basically includes the Moving Average forecast with 6 records of each stock price. If there are no previous 6 records in the current record, then its forecast will be set as its original price. We except that this new feature contributes while learning algorithms

stock_symbols <- c(
  "THYAO", "AKBNK", "ARCLK", "ASELS", "BIMAS", "DOHOL", "EKGYO", "EREGL", "FROTO", "GUBRF",
  "GARAN", "KRDMD", "KCHOL", "KOZAL", "KOZAA", "PGSUS", "PETKM", "SAHOL", "SASA", "SISE",
  "TAVHL", "TKFEN", "TUPRS", "TTKOM", "TCELL", "HALKB", "ISCTR", "VAKBN", "VESTL", "YKBNK"
)

add_forecast_column <- function(data) {
  data <- data %>%
    arrange(time) %>%
    mutate(`forecast_ma_6` = ifelse(row_number() < 7, price,
                                     (lag(price, 1) + lag(price, 2) + lag(price, 3) +
                                        lag(price, 4) + lag(price, 5) + lag(price, 6)) / 6))
  return(data)
}

# Apply the function for each stock symbol
result_list <- lapply(stock_symbols, function(symbol) {
  symbol_data <- data %>% filter(short_name == symbol)
  symbol_data <- add_forecast_column(symbol_data)
  return(symbol_data)
})

# Combine the results into a single data frame
data <- bind_rows(result_list)

# Moving price column to the last place of dataframe.
data <- data[, c(setdiff(names(data), "price"), "price")]

head(data,8)
##             timestamp short_name       time year month day hour forecast_ma_6
## 1 2018-01-02 09:00:00      THYAO 2018-01-02 2018     1   2    9      15.79000
## 2 2018-01-02 10:00:00      THYAO 2018-01-02 2018     1   2   10      16.01000
## 3 2018-01-02 11:00:00      THYAO 2018-01-02 2018     1   2   11      16.05000
## 4 2018-01-02 12:00:00      THYAO 2018-01-02 2018     1   2   12      16.05000
## 5 2018-01-02 13:00:00      THYAO 2018-01-02 2018     1   2   13      16.06000
## 6 2018-01-02 14:00:00      THYAO 2018-01-02 2018     1   2   14      16.05000
## 7 2018-01-02 15:00:00      THYAO 2018-01-02 2018     1   2   15      16.00167
## 8 2018-01-02 16:00:00      THYAO 2018-01-02 2018     1   2   16      16.05500
##   price
## 1 15.79
## 2 16.01
## 3 16.05
## 4 16.05
## 5 16.06
## 6 16.05
## 7 16.11
## 8 16.13

We are going to use the quantmod library to extract related-meaningful information about our 30 stocks such as volume of the stock for a day. We will analyze whether these extracted features contribute our models or not.

Basically, quantmod is an R package that provides a framework for quantitative financial modeling and trading. It provides a rapid prototyping environment that makes modeling easier by removing the repetitive workflow issues surrounding data management and visualization.

Again, for the specified 30 stocks, start_date and end_date (must be t + 1 since it accepts as exclusive rhs), we are going to extract the information about stocks such as open, close prices, volume etc.

Also, since the quantmod library does not allow us to fetch hourly data more than 7 days, we move on with the daily average volume data of stocks.

% Note that end_date must be updated after new data has been obtained.

# Specify the start and end dates
start_date <- as.Date("2018-01-02")
end_date <- as.Date("2023-11-22")

# Create an empty data frame to store the data
all_stock_data <- data.frame()

# Loop through each stock symbol and retrieve data
for (stock_symbol in stock_symbols) {
  # Append ".IS" to the stock symbol
  full_symbol <- paste0(stock_symbol, ".IS")
  
  # Download stock data
  getSymbols(full_symbol, src = "yahoo", from = start_date, to = end_date)
  
  # Extract hourly data for the specified time range
  hourly_data <- to.hourly(get(full_symbol))
  
  # Append the data to the 'all_stock_data' data frame
  all_stock_data <- rbind(all_stock_data, data.frame(Symbol = stock_symbol, hourly_data))
}

# Remove the prefix "get.full_symbol.." from column names
cleaned_colnames <- gsub("^get.*\\.", "", colnames(all_stock_data))
colnames(all_stock_data) <- cleaned_colnames

all_stock_data$Date <- as.Date(rownames(all_stock_data))
# Print the first few rows of the combined data
head(all_stock_data)
##            Symbol  Open  High   Low Close   Volume Adjusted       Date
## 2018-01-02  THYAO 15.79 16.18 15.67 16.08 44253261    16.08 2018-01-02
## 2018-01-03  THYAO 16.05 16.23 15.92 16.20 40100544    16.20 2018-01-03
## 2018-01-04  THYAO 16.20 16.51 16.18 16.29 54106099    16.29 2018-01-04
## 2018-01-05  THYAO 16.44 16.55 16.18 16.33 55516943    16.33 2018-01-05
## 2018-01-08  THYAO 16.53 16.55 16.29 16.29 28138834    16.29 2018-01-08
## 2018-01-09  THYAO 16.29 16.45 16.10 16.10 34728669    16.10 2018-01-09

Now, we have obtained the 30 stocks’ information between start and end dates. Since we have the 10-hour information of stock prices, we can use the volume information.

To add the volume information to our previous main data, we can use merge() function since there are significant number of records, it would be extremely efficient. Let’s merge them on stock_name and the date (ex. 2022-11-10). Also, we are going to eliminate some of the features such as time from the merged final dataframe.

Moreover, we do not want missing or anomaly volume values in our dataframe. As can be seen below, there are no missing values, luckily. But, we have seen that all stock’s volumes are 0 in a specific day (The day that COVID starts in Turkey). Thus, we are going to eliminate the records whose volume is 0 to inhibit the bias.

# Merge the two dataframes based on the 'short_name' and 'Date' columns
merged_data <- merge(data, all_stock_data[, c("Symbol", "Volume", "Date")], 
                     by.x = c("short_name", "time"), by.y = c("Symbol", "Date"), all.x = TRUE)

# Rename the merged 'Volume' column to 'volume' in the 'data' dataframe
merged_data <- rename(merged_data, volume = Volume)
merged_data <- merged_data %>% arrange(short_name, timestamp)

final_data <- merged_data %>%
  select(timestamp, short_name, year, month, day, hour, volume,"forecast_ma_6", price)

head(final_data)
##             timestamp short_name year month day hour   volume forecast_ma_6
## 1 2018-01-02 09:00:00      AKBNK 2018     1   2    9 22346109        6.9475
## 2 2018-01-02 10:00:00      AKBNK 2018     1   2   10 22346109        7.0602
## 3 2018-01-02 11:00:00      AKBNK 2018     1   2   11 22346109        7.0954
## 4 2018-01-02 12:00:00      AKBNK 2018     1   2   12 22346109        7.0814
## 5 2018-01-02 13:00:00      AKBNK 2018     1   2   13 22346109        7.1024
## 6 2018-01-02 14:00:00      AKBNK 2018     1   2   14 22346109        7.1377
##    price
## 1 6.9475
## 2 7.0602
## 3 7.0954
## 4 7.0814
## 5 7.1024
## 6 7.1377
colSums(sapply(all_stock_data, is.na))
##   Symbol     Open     High      Low    Close   Volume Adjusted     Date 
##        0        0        0        0        0        0        0        0
final_data <- final_data %>%
  filter(volume != 0)

head(final_data,8)
##             timestamp short_name year month day hour   volume forecast_ma_6
## 1 2018-01-02 09:00:00      AKBNK 2018     1   2    9 22346109      6.947500
## 2 2018-01-02 10:00:00      AKBNK 2018     1   2   10 22346109      7.060200
## 3 2018-01-02 11:00:00      AKBNK 2018     1   2   11 22346109      7.095400
## 4 2018-01-02 12:00:00      AKBNK 2018     1   2   12 22346109      7.081400
## 5 2018-01-02 13:00:00      AKBNK 2018     1   2   13 22346109      7.102400
## 6 2018-01-02 14:00:00      AKBNK 2018     1   2   14 22346109      7.137700
## 7 2018-01-02 15:00:00      AKBNK 2018     1   2   15 22346109      7.070767
## 8 2018-01-02 16:00:00      AKBNK 2018     1   2   16 22346109      7.100100
##    price
## 1 6.9475
## 2 7.0602
## 3 7.0954
## 4 7.0814
## 5 7.1024
## 6 7.1377
## 7 7.1235
## 8 7.1235
colSums(sapply(final_data, is.na))
##     timestamp    short_name          year         month           day 
##             0             0             0             0             0 
##          hour        volume forecast_ma_6         price 
##             0             0             0             0

volume is successfully added to the final dataframe, also “Year,Month,Day,Hour” info are extracted from the timestamp feature. There are no missing values. We are ready to move on.

In this part, we need to create 30 different dataframe from the main dataframe since each analysis & modelling will be performed independently from another stock. Thus, we will be using these stocks separately.

thyao_data <- final_data %>% filter(short_name == "THYAO")
akbnk_data <- final_data %>% filter(short_name == "AKBNK")
arclk_data <- final_data %>% filter(short_name == "ARCLK")
asels_data <- final_data %>% filter(short_name == "ASELS")
bimas_data <- final_data %>% filter(short_name == "BIMAS")
dohol_data <- final_data %>% filter(short_name == "DOHOL")
ekgyo_data <- final_data %>% filter(short_name == "EKGYO")
eregl_data <- final_data %>% filter(short_name == "EREGL")
froto_data <- final_data %>% filter(short_name == "FROTO")
gubrf_data <- final_data %>% filter(short_name == "GUBRF")
garan_data <- final_data %>% filter(short_name == "GARAN")
krdmd_data <- final_data %>% filter(short_name == "KRDMD")
kchol_data <- final_data %>% filter(short_name == "KCHOL")
kozal_data <- final_data %>% filter(short_name == "KOZAL")
kozaa_data <- final_data %>% filter(short_name == "KOZAA")
pgsus_data <- final_data %>% filter(short_name == "PGSUS")
petkm_data <- final_data %>% filter(short_name == "PETKM")
sahol_data <- final_data %>% filter(short_name == "SAHOL")
sasa_data <- final_data %>% filter(short_name == "SASA")
sise_data <- final_data %>% filter(short_name == "SISE")
tahvl_data <- final_data %>% filter(short_name == "TAVHL")
tkfen_data <- final_data %>% filter(short_name == "TKFEN")
tuprs_data <- final_data %>% filter(short_name == "TUPRS")
ttkom_data <- final_data %>% filter(short_name == "TTKOM")
tcell_data <- final_data %>% filter(short_name == "TCELL")
halkb_data <- final_data %>% filter(short_name == "HALKB")
isctr_data <- final_data %>% filter(short_name == "ISCTR")
vakbn_data <- final_data %>% filter(short_name == "VAKBN")
vestl_data <- final_data %>% filter(short_name == "VESTL")
ykbnk_data <- final_data %>% filter(short_name == "YKBNK")

Descriptive Analysis

In this part, we are going to understand and analyze the data more by using summary, aggregated functions, sense visualizations etc.

We have data with one POSIX, one CHR and rest either int or num features, which is nice to user. And by looking at the statistical metrics, there are no obvious anomaly/extreme value in the data.

Let’s also plot some visualizations to understand or demonstrate data more. Stock Price Distribution as a boxplot and average price by months in years are also shown below.

dim(final_data)
## [1] 438858      9
str(final_data)
## 'data.frame':    438858 obs. of  9 variables:
##  $ timestamp    : POSIXct, format: "2018-01-02 09:00:00" "2018-01-02 10:00:00" ...
##  $ short_name   : chr  "AKBNK" "AKBNK" "AKBNK" "AKBNK" ...
##  $ year         : num  2018 2018 2018 2018 2018 ...
##  $ month        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ day          : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ hour         : int  9 10 11 12 13 14 15 16 17 18 ...
##  $ volume       : num  22346109 22346109 22346109 22346109 22346109 ...
##  $ forecast_ma_6: num  6.95 7.06 7.1 7.08 7.1 ...
##  $ price        : num  6.95 7.06 7.1 7.08 7.1 ...
summary(final_data)
##    timestamp                       short_name             year     
##  Min.   :2018-01-02 09:00:00.00   Length:438858      Min.   :2018  
##  1st Qu.:2019-06-21 11:00:00.00   Class :character   1st Qu.:2019  
##  Median :2020-12-10 15:00:00.00   Mode  :character   Median :2020  
##  Mean   :2020-12-10 18:36:29.56                      Mean   :2020  
##  3rd Qu.:2022-06-01 12:00:00.00                      3rd Qu.:2022  
##  Max.   :2023-11-21 13:00:00.00                      Max.   :2023  
##      month            day             hour           volume         
##  Min.   : 1.00   Min.   : 1.00   Min.   : 9.00   Min.   :9.000e+00  
##  1st Qu.: 3.00   1st Qu.: 8.00   1st Qu.:11.00   1st Qu.:7.062e+06  
##  Median : 6.00   Median :16.00   Median :13.00   Median :3.042e+07  
##  Mean   : 6.45   Mean   :15.62   Mean   :13.49   Mean   :6.475e+07  
##  3rd Qu.: 9.00   3rd Qu.:23.00   3rd Qu.:16.00   3rd Qu.:8.191e+07  
##  Max.   :12.00   Max.   :31.00   Max.   :18.00   Max.   :1.073e+09  
##  forecast_ma_6          price         
##  Min.   :  0.5844   Min.   :  0.5817  
##  1st Qu.:  5.0200   1st Qu.:  5.0200  
##  Median : 11.2202   Median : 11.2266  
##  Mean   : 32.0167   Mean   : 32.0445  
##  3rd Qu.: 26.4377   3rd Qu.: 26.4749  
##  Max.   :961.7500   Max.   :975.0000
ggplot(final_data, aes(x = short_name, y = price, fill = short_name)) +
  geom_boxplot() +
  labs(title = "Stock Price Distribution",
       x = "Stock",
       y = "Price") +
  theme_minimal() +
  theme(legend.position = "none")

average_price_by_month <- aggregate(price ~ month + year, data = final_data, mean)

ggplot(average_price_by_month, aes(x = month, y = price, fill = factor(year))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Stock Price by Month",
       x = "Month",
       y = "Average Price") +
  theme_minimal() +
  theme(legend.position = "top")

We have write a function detecting anomaly in the plots. Anomaly is detected in date, if the value is less or grater than the mean (data) +- 3 * stdev(data). There are 5 example stock prices plots to detect anomalies. None of them included any anomaly values (Rest of them are also controlled). This is also a good sign for us to create sense models.

plot_anomaly <- function(df){
  anomaly_threshold_price <-  mean(df$price) - 3 * sqrt(var(df$price)) # Adjust this threshold as needed
  anomaly_values_price <- df$price < anomaly_threshold_price
  
  stock_title <- paste(df$short_name[1],"Stock Prices Over Time with Anomalies", sep = " ")
  
  ggplot(df, aes(x = timestamp, y = price)) +
    geom_line() +
    geom_point(data = df[anomaly_values_price, ], aes(color = "Anomaly"), size = 3) +
    labs(x = "Datetime", y = "Stock Closing Prices", title = stock_title) +
    scale_color_manual(values = c("Anomaly" = "red")) +
    theme_minimal()
}
plot_anomaly(thyao_data)

plot_anomaly(akbnk_data)

plot_anomaly(vestl_data)

plot_anomaly(froto_data)

plot_anomaly(sise_data)

Finally, using quantmod library and its functionalities lets look at the visualizations of example stocks information such as THYAO.IS and SISE.IS. These candlestick and line plots are helpful for further analysis also since we get insight how current volume affects further stock price etc.

Also, all stock prices are plotted as single plot in below. It can be seen that some of the stocks have multiplied prices accross many and we think that the splitting dataset as different stocks might be good idea to block any negative effects.

# Plot candlestick chart for one stock (e.g., THYAO, SISE)
thYaoData <- getSymbols("THYAO.IS", src = "yahoo", from = "2018-01-02", to = "2023-11-22", auto.assign = FALSE)
candleChart(thYaoData, theme = "white")

siseData <- getSymbols("SISE.IS", src = "yahoo", from = "2018-01-02", to = "2023-11-22", auto.assign = FALSE)
candleChart(siseData, theme = "white")

ggplot(all_stock_data, aes(x = Date, y = Close, color = Symbol)) +
  geom_line() +
  labs(title = "Closing Prices Over Time for Selected Stocks",
       x = "Date",
       y = "Closing Price",
       color = "Stock Symbol") +
  theme_minimal()

Let’s also analyze the data by applying clustering methods. It will help us to see which stocks are similar to each other in terms of distance.

Clustering Analysis

Clustering is the task of dividing the population or data points into a number of groups such that data points in the same groups are more similar to other data points in the same group and dissimilar to the data points in other groups. It is basically a collection of objects on the basis of similarity and dissimilarity between them.

In this dataset, we can obtain insights about which stocks are similar etc.

Let’s start with k means clustering :

cluster_data <- final_data

features <- cluster_data[, c("volume", "forecast_ma_6", "price")]

# Scale the features
scaled_features <- scale(features)

# Choose the number of clusters (k)
k <- 4

# Apply k-means clustering
kmeans_result <- kmeans(scaled_features, centers = k, nstart = 25)

# Add the cluster labels to your data
cluster_data$cluster <- as.factor(kmeans_result$cluster)

head(cluster_data)
##             timestamp short_name year month day hour   volume forecast_ma_6
## 1 2018-01-02 09:00:00      AKBNK 2018     1   2    9 22346109        6.9475
## 2 2018-01-02 10:00:00      AKBNK 2018     1   2   10 22346109        7.0602
## 3 2018-01-02 11:00:00      AKBNK 2018     1   2   11 22346109        7.0954
## 4 2018-01-02 12:00:00      AKBNK 2018     1   2   12 22346109        7.0814
## 5 2018-01-02 13:00:00      AKBNK 2018     1   2   13 22346109        7.1024
## 6 2018-01-02 14:00:00      AKBNK 2018     1   2   14 22346109        7.1377
##    price cluster
## 1 6.9475       3
## 2 7.0602       3
## 3 7.0954       3
## 4 7.0814       3
## 5 7.1024       3
## 6 7.1377       3
# Scatter plot in 3D
plot_ly(cluster_data, x = ~volume, y = ~`forecast_ma_6`, z = ~price, color = ~cluster,
        text = ~short_name,  # This adds stock names as labels
        type = "scatter3d", mode = "markers", marker = list(size = 5)) %>%
  layout(scene = list(xaxis = list(title = "Volume"),
                      yaxis = list(title = "forecast_ma_6"),
                      zaxis = list(title = "Price"),
                      margin = list(l = 0, r = 0, b = 0, t = 0)))
fviz_cluster(kmeans_result, data = cluster_data[, c("volume", "price")])

set.seed(10)  # Set seed for reproducibility
sample_size <- 40  # Adjust the sample size as needed
subset_data <- cluster_data[sample(1:nrow(cluster_data), sample_size), 
                            c("short_name","volume", "forecast_ma_6", "price")]
normalized_data <- scale(subset_data[, -1])

distance_matrix <- dist(normalized_data, method = "euclidean")
hierarchical_cluster <- hclust(distance_matrix, method = "average")
clusters <- cutree(hierarchical_cluster, k = 5)  # You can adjust the number of clusters (k) as needed,
dend <- as.dendrogram(hierarchical_cluster)

# Plot the hierarchical clustering with factoextra
fviz_dend(dend, k = 5, main = "Hierarchical Clustering Dendrogram")

It can be seen that some stocks are stored in same clusters with minimum distances:

  • VKBANK and HLKBNK

  • THYAO, SISE, SASA, EREGL

  • ASELS, TUPRS, YKBANK, AKBANK

These stocks can be used to predict the price trend of each other.

We are ready to move on with the modelling part of the report.

Machine Learning / Data Mining Models

We have completed the preprocessing, feature engineering, cleaning and the overall descriptive analysis part of the work. Now, we are going to use different kind of machine learning and statistical models to predict the future forecasts well.

We plan to implement 4 main models in our work :

Note that our main metric will be Weighted Mean Absolute Percentage Error (WMAPE) which is a measure used to evaluate the accuracy of forecast models. This metric provides a normalized error rate across all predictions, allowing for a fair comparison between different forecasting models or different data sets.

In this report, we will be analyzing a stock to select the best model for predictions. But in our work, we test with multiple stocks to get balanced information. We are going to use “THYAO” stock as a sample stock.

Let’s start with the Decision Tree Algorithm.

Decision Tree Algorithm

In order not to mutate the original data, we copy it and split it into training and test dataset which will be used in modeling and testing respectively.

dt_thyao_data <- thyao_data

validationIndex <- createDataPartition(dt_thyao_data$price, p=0.70, list=FALSE)

dt_train <- dt_thyao_data[validationIndex,] # 70% of data to training
dt_test <- dt_thyao_data[-validationIndex,]

cat("Dimension of the main dataset:",dim(dt_thyao_data))
## Dimension of the main dataset: 14632 9
cat("Dimension of the train dataset:",dim(dt_train))
## Dimension of the train dataset: 10244 9
cat("Dimension of the test dataset:",dim(dt_test))
## Dimension of the test dataset: 4388 9

Dataset is ready to be learned and now we are going to implement a manual cross validation to tune the hyperparameters for the decision tree and we are going to look at its MSE value in each iteration.

Assuming the tuning hyperparameter will be only the minimal number of observations per tree leaf. Setting complexity parameter to zero and minimum number of observations to split as the twice as the minimal number of observations per tree leaf.

In the end, we are going to store them in df to find the best model.

cv_results <- data.frame(minbucket = numeric(), RMSE = numeric())

set.seed(10)

# Define a range of minsplit values to try
minbucket_values <- c(1, 5, 10, 15, 20) 

# Perform cross-validation for each minbucket value
for (minbucket_val in minbucket_values) {
  # Create a decision tree regression model with the current minbucket value
  tree_model <- rpart(price~.,dt_train,method='anova',
                      control=rpart.control(cp=0, minbucket = minbucket_val, minsplit = 2*minbucket_val))
  
  predictions <- predict(tree_model, newdata = dt_train)
  
  true_values <- dt_train$price
  
  rmse <- round(sqrt(mean((predictions - true_values)^2)),3)
  
  # Store the results in the data frame
  cv_results <- rbind(cv_results, data.frame(minbucket = minbucket_val, RMSE = rmse))
}

ggplot(cv_results, aes(x = minbucket, y = RMSE)) +
  geom_line() +
  geom_point() +
  labs(title = "RMSE vs. MinBucket",
       x = "minbucket",
       y = "RMSE") +
  theme_minimal()

Least MSE belongs to the minbucket = 1 but the this model poorly performs in the test dataset. It seems that overfitting occurs. On the other hand, minbucket = 5 performs well both in training and test dataset. Looks like the model with minbucket = 5 is the best model in terms of RMSE.

In below, we have created the best model with corresponding hyperparameters and some visualizations are plotted.

minbucket_val <- 5
best_tree_model <- rpart(price~.,dt_train,method='anova',
                         control=rpart.control(cp=0, minbucket = minbucket_val, minsplit = 2*minbucket_val))

fancyRpartPlot(best_tree_model)

barplot(best_tree_model$variable.importance,horiz=F)

Lets look at its other regression metrics such as MAE, RMSE, MAPE etc. Also predicted vs. actual plot can be seen:

# Make predictions on the test set
predictions <- predict(best_tree_model, newdata = dt_test)
# True values from the test set
true_values <- dt_test$price

# Calculate Mean Absolute Error (MAE)
mae <- mean(abs(predictions - true_values))
print(paste("Mean Absolute Error (MAE):", mae))
## [1] "Mean Absolute Error (MAE): 0.465661554419702"
# Calculate Root Mean Squared Error (RMSE)
rmse <- round(sqrt(mean((predictions - true_values)^2)),3)
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 1.117"
# Calculate Mean Absolute Percentage Error (MAPE)
mape <- mean(abs((predictions - true_values) / true_values) * 100, na.rm = TRUE)
cat("Mean Absolute Percentage Error (MAPE):", mape, "\n")
## Mean Absolute Percentage Error (MAPE): 0.9577984
calculate_wmape <- function(actual, forecasted) {
  n <- length(actual)
  wmape <- (sum(abs(actual - forecasted)) / sum(abs(actual))) * 100
  return(wmape)
}

# Calculate Weighted Mean Absolute Percentage Error (WMAPE)
wmape <- calculate_wmape(predictions, true_values)
print(paste("Weighted Mean Absolute Percentage Error (WMAPE):", wmape))
## [1] "Weighted Mean Absolute Percentage Error (WMAPE): 0.982384375876732"
# Calculate R-squared (R²)
sse <- sum((predictions - true_values)^2)
sst <- sum((true_values - mean(true_values))^2)
rsquared <- 1 - (sse / sst)
print(paste("R-squared (R²):", rsquared))
## [1] "R-squared (R²): 0.999699836261467"
# Create a scatterplot
plot(true_values, predictions, 
     main = "Predictions vs Actual Data",
     xlab = "True Values",
     ylab = "Predicted Values",
     pch = 16,  # Set the point character
     col = "blue"  # Set the point color
)

# Add a diagonal line for reference
abline(0, 1, col = "red", lty = 2)

# Add a legend
legend("topright", legend = "Diagonal Line (Reference)", col = "red", lty = 2, cex = 0.8)

It is extremely successful with the test dataset also, but it might get overfit by the complex model since Decision Trees tend to overfit easily.

Let’s move on with Random Forest.

Random Forest Algorithm

In order not to mutate the original data, we copy it and split it into training and test dataset which will be used in modeling and testing respectively.

rf_thyao_data <- thyao_data
validationIndex <- createDataPartition(rf_thyao_data$price, p=0.70, list=FALSE)

rf_train <- rf_thyao_data[validationIndex,] # 70% of data to training
rf_test <- rf_thyao_data[-validationIndex,]

cat("Dimension of the main dataset:",dim(rf_thyao_data))
## Dimension of the main dataset: 14632 9
cat("Dimension of the train dataset:",dim(rf_train))
## Dimension of the train dataset: 10244 9
cat("Dimension of the test dataset:",dim(rf_test))
## Dimension of the test dataset: 4388 9

Dataset is ready to be learned and now we are going to implement a manual cross validation to tune the hyperparameters for the random forest and we are going to look at its error value in each iteration.

Assuming the tuning hyperparameter will be only the effect of the ratio of the number of features evaluated at each split (mtry) and setting other parameters as J=500 and the minimal number of observations per tree leaf=5.

In the end, we are going to store them in table to find the best model.

set.seed(10)
# Set the number of trees and nodesize
num_trees <- 500
min_obs_per_leaf <- 5

# Create a grid of mtry values to explore
mtry_values <- c(2, 4, 6, 8, 10)  # Add more values as needed

# Create an empty data frame to store results
rf_results <- data.frame(mtry = numeric(), RMSE = numeric())

# Perform grid search
for (m in mtry_values) {
  # Train the Random Forest model
  rf_model <- randomForest(price ~ ., data = rf_train, 
                           ntree = num_trees, nodesize = min_obs_per_leaf, mtry = m)
  
  # Make predictions on the training set
  predictions <- predict(rf_model, newdata = rf_train)
  
  true_values <- rf_train$price
  
  rmse <- round(sqrt(mean((predictions - true_values)^2)),3)
  
  # Store the results in the data frame
  rf_results <- rbind(rf_results, data.frame(mtry = m, RMSE = rmse))
}

# Print the results
print(rf_results)
##   mtry  RMSE
## 1    2 0.953
## 2    4 0.374
## 3    6 0.378
## 4    8 0.391
## 5   10 0.388
ggplot(rf_results, aes(x = mtry, y = RMSE)) +
  geom_line() +
  geom_point() +
  labs(title = "RMSE vs. mtry",
       x = "mtry",
       y = "RMSE") +
  theme_minimal()

Looks like the model converges when mtry = 4 (given ntree = 500 & nodesize = 5) is the best model in terms of RMSE, lets also analyze the other metrics using test dataset.

In below, we have created the best model with corresponding hyperparameters and metrics with test data are listed.

set.seed(10)

best_rf_model <- randomForest(price ~ ., data = rf_train, 
                              ntree = num_trees, nodesize = min_obs_per_leaf, mtry = 4)

# Make predictions on the test set
predictions <- predict(best_rf_model, newdata = rf_test)

# True values from the test set
true_values <- rf_test$price

# Calculate Mean Absolute Error (MAE)
mae <- mean(abs(predictions - true_values))
print(paste("Mean Absolute Error (MAE):", mae))
## [1] "Mean Absolute Error (MAE): 0.351266339373101"
# Calculate Root Mean Squared Error (RMSE)
rmse <- round(sqrt(mean((predictions - true_values)^2)),3)
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 0.844"
# Calculate Mean Absolute Percentage Error (MAPE)
mape <- mean(abs((predictions - true_values) / true_values) * 100, na.rm = TRUE)
cat("Mean Absolute Percentage Error (MAPE):", mape, "\n")
## Mean Absolute Percentage Error (MAPE): 0.6816682
# Calculate Weighted Mean Absolute Percentage Error (WMAPE)
wmape <- calculate_wmape(predictions, true_values)
print(paste("Weighted Mean Absolute Percentage Error (WMAPE):", wmape))
## [1] "Weighted Mean Absolute Percentage Error (WMAPE): 0.728230833684908"
# Calculate R-squared (R²)
sse <- sum((predictions - true_values)^2)
sst <- sum((true_values - mean(true_values))^2)
rsquared <- 1 - (sse / sst)
print(paste("R-squared (R²):", rsquared))
## [1] "R-squared (R²): 0.999833135247456"

It can be seen that random forest performed also very well on this dataset. Moreover, the possibility for overfitting the random forest model is much less than the decision trees since it is using bootstrap aggregating technique.

Let’s also make some visualizations on the grey-box random forest model.

# Plot variable importance
varImpPlot(best_rf_model)

# Create partial dependence plot
partial_plot <- partial(best_rf_model, pred.var = 'year', data = dt_test)

# Plot the partial dependence plot
plot(partial_plot)

Let’s move on with the third algorithm which is ARIMA.

ARIMA (AutoRegressive Integrated Moving Average)

In order not to mutate the original data, we copy it and split it into training and test dataset which will be used in modeling and testing respectively.

arima_thyao_data <- thyao_data
validationIndex <- createDataPartition(arima_thyao_data$price, p=0.70, list=FALSE)

arima_train <- arima_thyao_data[validationIndex,] # 70% of data to training
arima_test <- arima_thyao_data[-validationIndex,]

cat("Dimension of the main dataset:",dim(arima_thyao_data))
## Dimension of the main dataset: 14632 9
cat("Dimension of the train dataset:",dim(arima_train))
## Dimension of the train dataset: 10244 9
cat("Dimension of the test dataset:",dim(arima_test))
## Dimension of the test dataset: 4388 9

Let’s prepare the timeseriers data as 10-hours frequency, fit the ARIMA model using auto.arima function and plot.

# frequency is 10 since we have 10-hours data for each stock and day.
ts_data <- ts(arima_train$price, frequency = 10)
# plotting time-series data with prices
plot(ts_data, main = "Time Series Plot")

diff_ts_data <- diff(ts_data)
# fitting arima model
arima_model <- auto.arima(ts_data)

checkresiduals(arima_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,2,0)(1,0,2)[10]
## Q* = 565.95, df = 12, p-value < 2.2e-16
## 
## Model df: 8.   Total lags used: 20
summary(arima_model)
## Series: ts_data 
## ARIMA(5,2,0)(1,0,2)[10] 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5    sar1     sma1     sma2
##       -0.8420  -0.6614  -0.4797  -0.3114  -0.1497  0.6262  -0.6190  -0.0622
## s.e.   0.0098   0.0125   0.0133   0.0125   0.0098  0.0770   0.0769   0.0104
## 
## sigma^2 = 0.8012:  log likelihood = -13394.26
## AIC=26806.53   AICc=26806.55   BIC=26871.64
## 
## Training set error measures:
##                        ME      RMSE       MAE          MPE     MAPE      MASE
## Training set 0.0001983816 0.8946578 0.3611751 0.0001124789 0.711001 0.3000976
##                     ACF1
## Training set -0.02200168

Let’s now forecast further along with number of test records.

forecast_values <- forecast(arima_model, h = nrow(arima_test))  
plot(forecast_values, main = "ARIMA Forecast")

combined_data <- data.frame(
  timestamp = arima_test$timestamp,
  actual = arima_test$price,
  forecast_mean = forecast_values$mean,
  forecast_lower_80 = forecast_values$lower[, "80%"],
  forecast_upper_80 = forecast_values$upper[, "80%"],
  forecast_lower_95 = forecast_values$lower[, "95%"],
  forecast_upper_95 = forecast_values$upper[, "95%"]
)

ggplot(combined_data, aes(x = timestamp)) +
  geom_line(aes(y = actual, color = "Actual"), linetype = "solid", size = 1, alpha = 0.8) +
  geom_line(aes(y = forecast_mean, color = "Forecast"), linetype = "solid", size = 1, alpha = 0.8) +
  geom_ribbon(aes(ymin = forecast_lower_80, ymax = forecast_upper_80, fill = "80% CI"), alpha = 0.3) +
  geom_ribbon(aes(ymin = forecast_lower_95, ymax = forecast_upper_95, fill = "95% CI"), alpha = 0.3) +
  labs(title = "ARIMA Forecast vs Actual",
       x = "Timestamp",
       y = "Price") +
  scale_color_manual(values = c("Actual" = "blue", "Forecast" = "red")) +
  scale_fill_manual(values = c("80% CI" = "pink", "95% CI" = "orange")) +
  theme_minimal() +
  theme(legend.position = "top")

Finally, let’s look at the metrics of ARIMA model:

mae <- mean(abs(forecast_values$mean - arima_test$price))
mse <- mean((forecast_values$mean - arima_test$price)^2)
rmse <- sqrt(mse)
mape <- mean(abs((forecast_values$mean - arima_test$price) / arima_test$price) * 100, na.rm = TRUE)
wmape <- calculate_wmape(forecast_values$mean, arima_test$price)
cat("MAE:", mae, "\n")
## MAE: 1615.511
cat("MSE:", mse, "\n")
## MSE: 3195873
cat("RMSE:", rmse, "\n")
## RMSE: 1787.701
cat("MAPE:", mape, "\n")
## MAPE: 7327.988
cat("WMAPE:", wmape, "\n")
## WMAPE: 97.15201

Results are not as good as Decision Tree or Random Forest but it does not create complex models even they are simple. So, it might be more beneficial with unseen data when it comes.

Finally, let’s also model with Gradient Boosting Machine Algorithm (GBM).

GBM Algorithm

In order not to mutate the original data, we copy it and split it into training and test dataset which will be used in modeling and testing respectively. Since GBM could not handle POSIX and single type factor data types, we are removing them from the data. It might result with the lack of info but also less complex model. Let’s analyze it.

thyao_data_gbm <- thyao_data

thyao_data_gbm <- thyao_data_gbm[, !colnames(thyao_data_gbm) %in% c("short_name","timestamp")]

validationIndex <- createDataPartition(thyao_data_gbm$price, p=0.70, list=FALSE)

gbm_train <- thyao_data_gbm[validationIndex,] # 70% of data to training
gbm_test <- thyao_data_gbm[-validationIndex,]

cat("Dimension of the main dataset:",dim(thyao_data_gbm))
## Dimension of the main dataset: 14632 7
cat("Dimension of the train dataset:",dim(gbm_train))
## Dimension of the train dataset: 10244 7
cat("Dimension of the test dataset:",dim(gbm_test))
## Dimension of the test dataset: 4388 7

In above, there are 2 train & test pairs occur. One will be used in the cross validation and the other will be used to test the best model with tuned parameters.

Let’s tune the hyperparameters of the GBM with respect to the “RMSE” value.

We are mainly interested in tuning the depth (interaction.depth), the number of trees (n.trees) and the learning rate (shrinkage).

set.seed(10)

n_folds=10

fitControl=trainControl(method = "cv",
                        number = n_folds)
## gradient boosting
gbmGrid=expand.grid(interaction.depth = c(3, 5), 
                    n.trees = c(1:5)*100, 
                    shrinkage = c(0.05,0.1),
                    n.minobsinnode = 10)
set.seed(1)                        
gbm_fit=train(price ~ ., data = gbm_train, 
              method = "gbm", 
              trControl = fitControl, metric='RMSE',
              tuneGrid = gbmGrid,
              verbose=F) #verbose is an argument from gbm, prints to screen
gbm_fit
## Stochastic Gradient Boosting 
## 
## 10244 samples
##     6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9219, 9220, 9220, 9219, 9218, 9220, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  RMSE      Rsquared   MAE      
##   0.05       3                  100      1.341837  0.9996482  0.7193699
##   0.05       3                  200      1.087147  0.9997163  0.4937985
##   0.05       3                  300      1.072926  0.9997240  0.4866959
##   0.05       3                  400      1.062060  0.9997293  0.4823605
##   0.05       3                  500      1.052774  0.9997341  0.4784723
##   0.05       5                  100      1.207664  0.9996989  0.6220919
##   0.05       5                  200      1.056619  0.9997325  0.4753847
##   0.05       5                  300      1.034348  0.9997434  0.4661974
##   0.05       5                  400      1.021328  0.9997499  0.4621954
##   0.05       5                  500      1.015679  0.9997526  0.4602645
##   0.10       3                  100      1.185813  0.9996626  0.5662271
##   0.10       3                  200      1.138657  0.9996889  0.5428437
##   0.10       3                  300      1.112392  0.9997031  0.5309708
##   0.10       3                  400      1.094330  0.9997129  0.5226162
##   0.10       3                  500      1.083607  0.9997185  0.5176169
##   0.10       5                  100      1.101273  0.9997100  0.5149030
##   0.10       5                  200      1.061502  0.9997297  0.4978598
##   0.10       5                  300      1.038812  0.9997412  0.4881850
##   0.10       5                  400      1.035150  0.9997429  0.4855091
##   0.10       5                  500      1.027340  0.9997467  0.4820569
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 500, interaction.depth =
##  5, shrinkage = 0.05 and n.minobsinnode = 10.
plot(gbm_fit)

Selecting the best model is below:

Looks like the best GBM model with the parameters (Shrinkage = 0.1, depth = 5, n.trees = 500).

Let’s analyze this model with test data finally.

noftrees=500
depth=5
learning_rate=0.1

boosting_model=gbm(price~., data=gbm_train,distribution = 'gaussian', n.trees = noftrees,
                   interaction.depth = depth, n.minobsinnode = 10, shrinkage =learning_rate, cv.folds = 10)

gbm.perf(boosting_model, method = "cv")

## [1] 495
predictions <- predict(boosting_model, newdata = gbm_test, type = "response")

# True values from the test set
true_values <- gbm_test$price

# Calculate Mean Absolute Error (MAE)
mae <- mean(abs(predictions - true_values))
print(paste("Mean Absolute Error (MAE):", mae))
## [1] "Mean Absolute Error (MAE): 0.500087202401637"
# Calculate Root Mean Squared Error (RMSE)
rmse <- round(sqrt(mean((predictions - true_values)^2)),3)
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 1.069"
# Calculate Mean Absolute Percentage Error (MAPE)
mape <- mean(abs((predictions - true_values) / true_values) * 100, na.rm = TRUE)
cat("Mean Absolute Percentage Error (MAPE):", mape, "\n")
## Mean Absolute Percentage Error (MAPE): 1.11402
# Calculate Weighted Mean Absolute Percentage Error (WMAPE)
wmape <- calculate_wmape(predictions, true_values)
print(paste("Weighted Mean Absolute Percentage Error (WMAPE):", wmape))
## [1] "Weighted Mean Absolute Percentage Error (WMAPE): 1.04888531535607"

It has better results than ARIMA but worse than Decision Tree and Random Forest. But still the results are satisfied eventhough the problem is complex and the information we have is limited.

Anyway we need unseen data to select the best model for predictions.

Predictions with Selected Model for All Stocks

Assuming that GBM is the selected model and lets do the predictions on the created new data

# List of stock names
stock_names <- c("THYAO", "AKBNK", "ARCLK", "ASELS", "BIMAS", "DOHOL", "EKGYO", "EREGL", "FROTO", "GUBRF",
                 "GARAN", "KRDMD", "KCHOL", "KOZAL", "KOZAA", "PGSUS", "PETKM", "SAHOL", "SASA", "SISE",
                 "TAVHL", "TKFEN", "TUPRS", "TTKOM", "TCELL", "HALKB", "ISCTR", "VAKBN", "VESTL", "YKBNK")

# Function to generate new data for each stock
generate_new_data <- function(stock_name) {
  last_timestamp <- max(final_data$timestamp[final_data$short_name == stock_name])
  last_datetime <- as.POSIXct(last_timestamp)
  
  # Increase last_datetime by one day
  last_datetime <- last_datetime + days(1)
  
  # Extract date part
  last_date <- format(last_datetime, "%Y-%m-%d")
  
  # Combine with 09:00
  start_datetime <- as.POSIXct(paste(last_date, "09:00:00"))
  
  # Generate new timestamps starting from the next day of the last record, between 9:00 and 18:00
  new_timestamps <- seq.POSIXt(from = start_datetime, 
                               by = "1 hour", length.out = 10)
  
  # Extract year, month, day, and hour
  new_data <- data.frame(
    timestamp = new_timestamps,
    short_name = rep(stock_name, 10),
    year = format(new_timestamps, "%Y"),
    month = format(new_timestamps, "%m"),
    day = format(new_timestamps, "%d"),
    hour = format(new_timestamps, "%H"),
    volume = rep(71142665, 10),
    forecast_ma_6 = rep(NA, 10),
    price = rep(NA, 10)
  )
  
  return(new_data)
}

# List to store new data for each stock
new_data_list <- list()

# Generate new data for each stock
for (stock_name in stock_names) {
  new_data <- generate_new_data(stock_name)
  new_data_list[[length(new_data_list) + 1]] <- new_data
}

# Combine new data for all stocks into a single dataframe
new_data_df <- do.call(rbind, new_data_list)

head(new_data_df,10)
##              timestamp short_name year month day hour   volume forecast_ma_6
## 1  2023-11-22 09:00:00      THYAO 2023    11  22   09 71142665            NA
## 2  2023-11-22 10:00:00      THYAO 2023    11  22   10 71142665            NA
## 3  2023-11-22 11:00:00      THYAO 2023    11  22   11 71142665            NA
## 4  2023-11-22 12:00:00      THYAO 2023    11  22   12 71142665            NA
## 5  2023-11-22 13:00:00      THYAO 2023    11  22   13 71142665            NA
## 6  2023-11-22 14:00:00      THYAO 2023    11  22   14 71142665            NA
## 7  2023-11-22 15:00:00      THYAO 2023    11  22   15 71142665            NA
## 8  2023-11-22 16:00:00      THYAO 2023    11  22   16 71142665            NA
## 9  2023-11-22 17:00:00      THYAO 2023    11  22   17 71142665            NA
## 10 2023-11-22 18:00:00      THYAO 2023    11  22   18 71142665            NA
##    price
## 1     NA
## 2     NA
## 3     NA
## 4     NA
## 5     NA
## 6     NA
## 7     NA
## 8     NA
## 9     NA
## 10    NA

Filling the forecast_ma_6 of new_data column with using the previous data for each stock.

# Function to fill forecast_ma_6 column for each stock in new_data_df
fill_forecast_ma_6 <- function(stock_name, new_data_df, original_data) {
  # Filter original data for the specific stock
  stock_data <- original_data[original_data$short_name == stock_name, ]
  
  # Set initial values for the moving average calculation
  window_size <- 6
  forecast_column <- "forecast_ma_6"
  
  # Iterate over rows in new_data_df for the current stock
  for (i in which(new_data_df$short_name == stock_name)) {
    # Extract relevant information from new_data_df
    timestamp <- new_data_df$timestamp[i]
    
    # Filter previous records from stock_data for the current stock
    previous_records <- stock_data[stock_data$timestamp < timestamp, ]
    
    # Calculate moving average forecast
    if (nrow(previous_records) >= window_size) {
      # Use mean to calculate the average iteratively
      new_data_df[i, forecast_column] <- mean(tail(previous_records[, forecast_column], window_size))
      
      # Append a new record to stock_data
      new_record <- tail(previous_records, 1)
      new_record$timestamp <- timestamp
      new_record[, forecast_column] <- new_data_df[i, forecast_column]
      stock_data <- rbind(stock_data, new_record)
    }
  }
  
  return(new_data_df)
}
# Iterate over each stock to fill forecast_ma_6 in new_data_df
for (stock_name in stock_names) {
  new_data_df <- fill_forecast_ma_6(stock_name, new_data_df, final_data)
}

head(new_data_df,10)
##              timestamp short_name year month day hour   volume forecast_ma_6
## 1  2023-11-22 09:00:00      THYAO 2023    11  22   09 71142665      255.9722
## 2  2023-11-22 10:00:00      THYAO 2023    11  22   10 71142665      255.8981
## 3  2023-11-22 11:00:00      THYAO 2023    11  22   11 71142665      255.8673
## 4  2023-11-22 12:00:00      THYAO 2023    11  22   12 71142665      255.8521
## 5  2023-11-22 13:00:00      THYAO 2023    11  22   13 71142665      255.8900
## 6  2023-11-22 14:00:00      THYAO 2023    11  22   14 71142665      255.9202
## 7  2023-11-22 15:00:00      THYAO 2023    11  22   15 71142665      255.9000
## 8  2023-11-22 16:00:00      THYAO 2023    11  22   16 71142665      255.8880
## 9  2023-11-22 17:00:00      THYAO 2023    11  22   17 71142665      255.8863
## 10 2023-11-22 18:00:00      THYAO 2023    11  22   18 71142665      255.8894
##    price
## 1     NA
## 2     NA
## 3     NA
## 4     NA
## 5     NA
## 6     NA
## 7     NA
## 8     NA
## 9     NA
## 10    NA

Now, we have the data let’s do the predictions for all 30 stocks.

noftrees=500
depth=5
learning_rate=0.1
# Function to train random forest model and calculate performance metrics
train_and_evaluate_rf_model <- function(stock_data) {
  stock_name <- unique(stock_data$short_name)
  
  stock_data <- stock_data[, !colnames(stock_data) %in% c("short_name","timestamp")]

  validationIndex <- createDataPartition(stock_data$price, p=0.70, list=FALSE)
  
  gbm_train <- stock_data[validationIndex,] # 70% of data to training
  gbm_test <- stock_data[-validationIndex,]
  
  #ts_data <- ts(stock_data$price, frequency = 10)

  #arima_model <- auto.arima(ts_data)
  
  #predictions <- forecast(arima_model, h = nrow(rf_test))
  
  # Train GBM model
  boosting_model=gbm(price~., data=stock_data,distribution = 'gaussian', n.trees = noftrees,
                   interaction.depth = depth, n.minobsinnode = 10, shrinkage =learning_rate, cv.folds = 10)
  
  # Make predictions on test data
  predictions <- predict(boosting_model, newdata = gbm_test, type = "response")
  
  # Calculate performance metrics
  mae <- mae(predictions, gbm_test$price)
  mape <- mape(predictions, gbm_test$price)
  rmse <- rmse(predictions, gbm_test$price)
  wmape <- calculate_wmape(gbm_test$price, predictions)
  #mae <- mae(predictions$mean, rf_test$price)
  #mape <- mape(predictions$mean, rf_test$price)
  #rmse <- rmse(predictions$mean, rf_test$price)
  #wmape <- calculate_wmape(rf_test$price, predictions$mean)
  
  # Create a data frame to store performance metrics
  performance_df <- data.frame(StockName = stock_name, MAE = mae, RMSE = rmse, MAPE = mape, WMAPE = wmape)
  
  further_data <- new_data_df%>% filter(short_name == stock_name)
  # Predictions for further day
  further_predictions <- predict(boosting_model, newdata = further_data, type = "response")
  #further_predictions <- forecast(arima_model, h = nrow(further_data))

  assign(paste0("pred_", stock_name), further_predictions, envir = .GlobalEnv)
  
  return(performance_df)
}

# List to store performance metrics for each stock
performance_list <- list()

# List of stock data tables
stock_data_list <- list(thyao_data, akbnk_data, arclk_data, asels_data, bimas_data,
                        dohol_data, ekgyo_data, eregl_data, froto_data, gubrf_data,
                        garan_data, krdmd_data, kchol_data, kozal_data, kozaa_data,
                        pgsus_data, petkm_data, sahol_data, sasa_data, sise_data,
                        tahvl_data, tkfen_data, tuprs_data, ttkom_data, tcell_data,
                        halkb_data, isctr_data, vakbn_data, vestl_data, ykbnk_data)

# Train, evaluate, and store performance metrics for each stock
for (stock_data in stock_data_list) {
  performance_df <- train_and_evaluate_rf_model(stock_data)
  performance_list[[length(performance_list) + 1]] <- performance_df
}

# Combine performance metrics for all stocks into a single dataframe
all_performance_df <- do.call(rbind, performance_list)

all_performance_df
##    StockName        MAE       RMSE        MAPE     WMAPE
## 1      THYAO 0.39033646 0.75904457 0.009982112 0.8139060
## 2      AKBNK 0.08059697 0.13488143 0.009167021 0.9117055
## 3      ARCLK 0.38321889 0.67910588 0.008885492 0.8606178
## 4      ASELS 0.10692528 0.18013370 0.008836408 0.8802356
## 5      BIMAS 0.57445391 0.91766265 0.007718242 0.7454161
## 6      DOHOL 0.03603303 0.06228549 0.011078559 1.0155012
## 7      EKGYO 0.02590905 0.04404941 0.009360896 0.9023213
## 8      EREGL 0.13771721 0.22911102 0.008618360 0.8050142
## 9      FROTO 1.79307637 3.09861116 0.010588184 0.8699154
## 10     GUBRF 0.86666716 1.63972262 0.018342353 1.1172795
## 11     GARAN 0.12401965 0.21992172 0.009242295 0.9184567
## 12     KRDMD 0.08415955 0.14402922 0.011184162 1.0510681
## 13     KCHOL 0.29442832 0.50421826 0.008869320 0.8412946
## 14     KOZAL 0.07686525 0.15002164 0.010800054 1.0476330
## 15     KOZAA 0.22189492 0.36449483 0.011309696 1.0971747
## 16     PGSUS 1.70036439 3.12817178 0.013477721 1.0318505
## 17     PETKM 0.06436480 0.10886435 0.008765779 0.8765785
## 18     SAHOL 0.13014812 0.22278178 0.008352014 0.7961255
## 19      SASA 0.15964577 0.34284759 0.016878359 1.1615467
## 20      SISE 0.11922530 0.21145367 0.009104184 0.8113670
## 21     TAVHL 0.34857574 0.55199030 0.009526151 0.8917080
## 22     TKFEN 0.18895442 0.28918296 0.009133999 0.9323379
## 23     TUPRS 0.25018452 0.44317772 0.008769060 0.8326604
## 24     TTKOM 0.08010064 0.12959552 0.009650019 0.9414034
## 25     TCELL 0.15407581 0.24467778 0.008097754 0.8233283
## 26     HALKB 0.06268999 0.09925692 0.008456702 0.8695691
## 27     ISCTR 0.04719313 0.08821309 0.009699201 0.9777276
## 28     VAKBN 0.05330411 0.08566975 0.008888249 0.9109929
## 29     VESTL 0.22227962 0.36393234 0.010510001 0.9667567
## 30     YKBNK 0.04235321 0.07495944 0.010253290 0.9894556
ggplot(all_performance_df, aes(x = StockName)) +
  geom_bar(aes(y = MAE), fill = "blue", stat = "identity", position = "dodge") +
  geom_bar(aes(y = RMSE), fill = "green", stat = "identity", position = "dodge") +
  geom_bar(aes(y = MAPE), fill = "orange", stat = "identity", position = "dodge") +
  geom_bar(aes(y = WMAPE), fill = "red", stat = "identity", position = "dodge") +
  labs(title = "Performance Metrics by Stock",
       x = "Stock Name",
       y = "Value") +
  theme_minimal()

# Scatter plot for MAE and RMSE
ggplot(all_performance_df, aes(x = MAE, y = RMSE, color = StockName)) +
  geom_point() +
  labs(title = "Scatter Plot of MAE vs RMSE",
       x = "Mean Absolute Error (MAE)",
       y = "Root Mean Squared Error (RMSE)") +
  theme_minimal()

Stock Prices Submission Generator

Since we are going to submit our predictions through Google Forms as a pre-defined format. We wrote a function that creates this predictions submission format. It iteratively adds prediction (e.g. pred_THYAO, pred_SISE etc.) to a key as stock name in a dictionary.

stock_submission_generator <- function(stock_names) {
  # Create a list to store the predictions
  prediction_list <- list()
  
  # Iterate over each stock name
  for (stock_name in stock_names) {
    # Clean up the stock name to make it a valid R variable name
    cleaned_stock_name <- gsub("[^A-Za-z0-9_]", "_", stock_name)
    
    # Get the predicted values directly (no need for dynamic variable names)
    prediction_values <- get(paste0("pred_", cleaned_stock_name))
    
    # Store the results in the list
    prediction_list[[cleaned_stock_name]] <- prediction_values
  }
  # Combine the results into a named list
    result <- paste0("{", paste(lapply(names(prediction_list), function(stock_name) {
      paste0("'", stock_name, "': [", paste(prediction_list[[stock_name]], collapse = ", "), "]")
    }), collapse = ","), "}")
  
  return(result)
}

# Example usage
stock_names <- c("THYAO", "AKBNK", "ARCLK", "ASELS", "BIMAS", "DOHOL", "EKGYO", "EREGL", "FROTO", "GUBRF",
                 "GARAN", "KRDMD", "KCHOL", "KOZAL", "KOZAA", "PGSUS", "PETKM", "SAHOL", "SASA", "SISE",
                 "TAVHL", "TKFEN", "TUPRS", "TTKOM", "TCELL", "HALKB", "ISCTR", "VAKBN", "VESTL", "YKBNK")

# Call the function with the stock names and their respective predictions
submission_result <- stock_submission_generator(stock_names)

cat(submission_result)
## {'THYAO': [255.969608294983, 256.615663745042, 256.343819164993, 256.286943978973, 256.776961573296, 256.230657068488, 255.405122025276, 255.642850595897, 254.630862322638, 254.446730946689],'AKBNK': [30.5377457053842, 30.6499982152679, 30.6545821101494, 30.6340421657635, 30.6123844938834, 30.5430978536259, 30.5103589599156, 30.4955209189203, 30.4868931155545, 30.4868931155545],'ARCLK': [145.860830687556, 146.321065191659, 146.405986346904, 146.504901303402, 146.9023463766, 145.911320325004, 144.882098524878, 144.416734225873, 143.824825371749, 143.824825371749],'ASELS': [44.4198867173553, 44.4680389942263, 44.4493244132271, 44.3758695552037, 44.3794854848936, 44.310666789637, 44.2650634264843, 44.2524408275254, 44.1420185728259, 44.1939143615832],'BIMAS': [302.395341806553, 302.889211293687, 303.644672088887, 303.922445673488, 305.186732917224, 304.947733760969, 304.068421480413, 303.840402611338, 303.560112769404, 303.541455628894],'DOHOL': [13.5592177312487, 13.654865976692, 13.6639630183457, 13.6386829727241, 13.6192427340316, 13.5959425163554, 13.5848562721602, 13.5936732811178, 13.5784061670936, 13.6062732052344],'EKGYO': [7.70386144751303, 7.71622507398868, 7.7094679021718, 7.69546593991017, 7.6927756458934, 7.67578221775525, 7.66879883932349, 7.66501513308472, 7.66686396092585, 7.66370991777799],'EREGL': [39.9107318358038, 39.9192461697383, 39.8868636764233, 39.916438305072, 39.86014746665, 39.8016840528069, 39.7516323254061, 39.7590806825155, 39.7872541099952, 39.7872541099952],'FROTO': [876.108462123782, 878.354461503061, 879.463255826073, 875.375644104144, 873.904470919492, 873.719097149905, 876.490701867797, 871.49849783629, 871.445732427146, 871.409158268854],'GUBRF': [376.894632808437, 378.329539017715, 377.651837219326, 378.283511265868, 378.418762328101, 379.91511574578, 379.539731116453, 379.076632468205, 378.391929895249, 377.875792865052],'GARAN': [49.1590710160918, 49.24577704682, 49.2374379336387, 49.1960222374377, 49.184675726795, 48.9653008653089, 48.9500160857953, 49.0517229018098, 49.0134389746593, 49.0373418818771],'KRDMD': [24.8881287037912, 24.9203162325567, 24.886690235614, 24.8417434774918, 24.798016761538, 24.7911537433092, 24.7647804751558, 24.72298193628, 24.6626676657213, 24.6786057099739],'KCHOL': [142.199067454281, 142.420073194969, 142.779076776148, 142.801040163372, 143.046867322561, 142.883630702257, 142.367657697834, 141.965608754034, 141.029308415382, 141.095654607244],'KOZAL': [23.3876445676346, 23.4047227623321, 23.4397389577037, 23.4217976767029, 23.3836041288391, 23.3122763187895, 23.2289702862754, 23.2282209715559, 23.2030136574233, 23.1847236025491],'KOZAA': [57.765053450309, 57.7570716994725, 57.7413714059628, 57.6855321239444, 57.7864223142783, 57.816405851658, 56.9858927441064, 56.8650760086098, 56.2024015086388, 55.9931997806777],'PGSUS': [743.977976685583, 741.636780532259, 741.458616013464, 737.653916272413, 737.096401078555, 737.571740812299, 736.419429205944, 737.118862991158, 738.150331742694, 738.150331742694],'PETKM': [22.4268864076383, 22.4299380996524, 22.3750340638284, 22.3254136666963, 22.313467377842, 22.3782379185641, 22.3453874367298, 22.3673316862495, 22.3050103736896, 22.2876960749845],'SAHOL': [60.2541537698305, 60.0595094907906, 60.563367500181, 60.5589445409151, 60.4624420304039, 60.3446231777883, 60.2806098625432, 60.2557572152218, 60.3153063330535, 60.2171722646652],'SASA': [45.1313615212914, 45.1595696932252, 45.2372353062594, 45.2372353062594, 45.2372353062594, 45.1764757288214, 45.1719721704567, 45.1719721704567, 45.1110450525011, 45.1110450525011],'SISE': [49.9703389045124, 50.0893370949699, 50.0427734449927, 50.0043848876834, 49.9644350488058, 49.9685643953651, 49.9455115432557, 49.95851816293, 49.9609979311807, 49.9339682067858],'TAVHL': [124.706128945275, 125.212187825872, 125.578093021028, 125.656942568025, 125.450108797787, 124.996798692821, 124.666144916565, 124.537551209203, 124.346139545153, 124.346139545153],'TKFEN': [44.6272313810065, 45.2607559373176, 45.3102960839512, 45.3371056093738, 45.4780669600067, 45.5735352710021, 45.8431932037062, 46.0308027894361, 45.9353152862123, 45.9249049957839],'TUPRS': [157.372943623826, 158.075405827756, 157.831955712166, 156.879696729547, 156.427362280661, 155.5347904043, 155.067114437495, 154.603353718596, 154.364022131893, 154.293854484729],'TTKOM': [22.6340629148268, 22.6040422364461, 22.5793803628193, 22.5265873961429, 22.4909912816922, 22.450780469113, 22.4168246506498, 22.4260897613999, 22.3978103516239, 22.4112822284847],'TCELL': [57.8685644539419, 57.9884784209217, 57.9172261782847, 57.8577756011559, 57.8697480092047, 57.7645901763819, 57.5412223903733, 57.4185966029327, 57.078829988433, 57.0139731802677],'HALKB': [13.3477762789072, 13.3560477926049, 13.3502295629136, 13.3444915742126, 13.3425200780537, 13.3034696027056, 13.2871871869258, 13.2871871869258, 13.26586686375, 13.26586686375],'ISCTR': [20.5899142732381, 20.5927869788382, 20.6070112529566, 20.6398301883283, 20.6838072382056, 20.6551544084539, 20.6334445955446, 20.6309879344768, 20.6212448505474, 20.6212448505474],'VAKBN': [15.0750054089719, 15.0278762502278, 15.0183741409164, 15.0400904293246, 15.0553594612608, 15.0383963630543, 14.9929424878878, 14.9943584378797, 15.0100707143995, 15.0123548199402],'VESTL': [62.8250070422009, 63.3331015994052, 63.3388704923063, 63.0936994608674, 62.6505349944898, 62.1552377035346, 61.8816053380952, 61.7086557066198, 61.4458397623903, 61.4905837422967],'YKBNK': [17.0138296833178, 17.0168556865153, 17.0192748089297, 17.0019559975133, 16.9996875213411, 16.98079197539, 16.9537840798112, 16.9563228498806, 16.9494606194054, 16.9681808955183]}

Calculating Next Day’s Expected Volume

Since Yahoo Finance Library enables volume information of stocks until day t and we need to predict next days’ stock prices. We have written a function that returns the excepted volume of a specific stock of the next day. It basically, aggregates 50 previous days and returns the calculate of their average volume.

calculate_next_day_expected_volume <- function(stock_name){
  
  test <- final_data %>% filter(short_name == stock_name)
  
  expected_nextday_volume <- round(mean(tail(test$volume, 50)))
  
  day <- tail(test$timestamp,1)
  
  timestamp <- ymd_hms(day, tz = "UTC")
  
  # Increment the date by 1 day and floor it to remove hours
  next_day <- floor_date(timestamp + days(1), unit = "days")
  
  cat("Volume for the stock",stock_name," for the day ",format(next_day, "%Y-%m-%d"),"is :", expected_nextday_volume)
  
}

calculate_next_day_expected_volume("THYAO")
## Volume for the stock THYAO  for the day  2023-11-22 is : 43865916
calculate_next_day_expected_volume("AKBNK")
## Volume for the stock AKBNK  for the day  2023-11-22 is : 68625757
calculate_next_day_expected_volume("ARCLK")
## Volume for the stock ARCLK  for the day  2023-11-22 is : 3404966
calculate_next_day_expected_volume("FROTO")
## Volume for the stock FROTO  for the day  2023-11-22 is : 800496

Predicting Stock Prices with Unseen Data

In this part, we are going to see that how well our selected model predicts the unseen next day’s stock price.

new_data <- data.frame(
  timestamp = seq(as.POSIXct("2023-12-12 09:00:00"), by = "hour", length.out = 5),
  short_name = rep("THYAO", 5),
  year = c(2018, 2018, 2018, 2018, 2018),
  month = c(1, 1, 1, 1, 1),
  day = c(2, 2, 2, 2, 2),
  hour = c(9, 10, 11, 12, 13),
  volume = rep(36869621 , 5),
  forecast_ma_6 = c(260.2, 260.8, 265.2, 262.5, 265),
  price = c(261.5, 262.4, 263.7, 264, 263)
)

new_data$year[1:5] <- 2023
new_data$month[1:5] <- 12
new_data$day[1:5] <- 12
new_data$hour[1:5] <- 9:13


head(new_data)
##             timestamp short_name year month day hour   volume forecast_ma_6
## 1 2023-12-12 09:00:00      THYAO 2023    12  12    9 36869621         260.2
## 2 2023-12-12 10:00:00      THYAO 2023    12  12   10 36869621         260.8
## 3 2023-12-12 11:00:00      THYAO 2023    12  12   11 36869621         265.2
## 4 2023-12-12 12:00:00      THYAO 2023    12  12   12 36869621         262.5
## 5 2023-12-12 13:00:00      THYAO 2023    12  12   13 36869621         265.0
##   price
## 1 261.5
## 2 262.4
## 3 263.7
## 4 264.0
## 5 263.0

With Decision Tree:

test_pred <- predict(best_tree_model, new_data[,-9])
test_pred
##        1        2        3        4        5 
## 257.0944 257.0944 257.0944 257.0944 257.0944
pred_THYAO <- predict(boosting_model, new_data[,-9])
mae <- mean(abs(test_pred - new_data$price))
print(paste("Mean Absolute Error (MAE):", mae))
## [1] "Mean Absolute Error (MAE): 5.82555555555556"
# Calculate Root Mean Squared Error (RMSE)
rmse <- round(sqrt(mean((test_pred - new_data$price)^2)),3)
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 5.895"
# Calculate Mean Absolute Percentage Error (MAPE)
mape <- mean(abs((test_pred - new_data$price) / new_data$price) * 100, na.rm = TRUE)
cat("Mean Absolute Percentage Error (MAPE):", mape, "\n")
## Mean Absolute Percentage Error (MAPE): 2.214562
# Calculate Weighted Mean Absolute Percentage Error (WMAPE)
wmape <- calculate_wmape(test_pred, new_data$price)
print(paste("Weighted Mean Absolute Percentage Error (WMAPE):", wmape))
## [1] "Weighted Mean Absolute Percentage Error (WMAPE): 2.26592043563758"

With ARIMA:

forecast_values <- forecast(arima_model, h = nrow(new_data))  
forecast_values$mean
## Time Series:
## Start = c(1025, 5) 
## End = c(1025, 9) 
## Frequency = 10 
## [1] 259.0378 259.8435 260.5037 261.4701 261.8396
mae <- mean(abs(forecast_values$mean - new_data$price))
mse <- mean((forecast_values$mean - new_data$price)^2)
rmse <- sqrt(mse)
mape <- mean(abs((forecast_values$mean - new_data$price) / new_data$price) * 100, na.rm = TRUE)
wmape <- calculate_wmape(forecast_values$mean, new_data$price)
cat("MAE:", mae, "\n")
## MAE: 2.38107
cat("MSE:", mse, "\n")
## MSE: 6.112297
cat("RMSE:", rmse, "\n")
## RMSE: 2.472306
cat("MAPE:", mape, "\n")
## MAPE: 0.9054944
cat("WMAPE:",wmape, "\n")
## WMAPE: 0.9139018